Original Author: Andreas Hoehn (Date: 2024-03-18)
Original scripts were produced by Andreas for workshops in Wales.
This file loads all pre-requisites and defines characteristics.
This ensures the script will read files from a specific location or write files to a specific location.
It should contain sub-folders for data, scripts, and output.
setwd("C:/Users/elizabeth.finn/OneDrive - Greater Manchester Combined Authority/SIPHER/syntheticPopulation/scripts/workshopVersions")
rm(list = ls()) # remove all objects from work space
gc(full = TRUE) # deep clean garbage
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 539056 28.8 1196954 64 686411 36.7
## Vcells 986041 7.6 8388608 64 1876649 14.4
dir() # check R has picked up relative file paths
## [1] "01_main.R" "02_data_table.R"
## [3] "03_load_link.R" "04_example_1.R"
## [5] "05_example_2.R" "RData"
## [7] "ROutput" "syntheticPopulation_v01"
## [9] "v01.html" "v01.Rmd"
This list required packages and install them if required.
# List of required packages
RequiredPackages <- c("data.table", # for dialect
"ggplot2", # for plotting
"sf", "scales") # for maps
# Ensure all packages are installed and loaded
.EnsurePackages <- function(packages_vector) {
new_package <- packages_vector[!(packages_vector %in%
installed.packages()[, "Package"])]
if (length(new_package) != 0) {
install.packages(new_package) }
sapply(packages_vector, suppressPackageStartupMessages(require),
character.only = TRUE)
}
.EnsurePackages(RequiredPackages)
## Loading required package: data.table
## Loading required package: ggplot2
## Loading required package: sf
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
## Loading required package: scales
## data.table ggplot2 sf scales
## TRUE TRUE TRUE TRUE
Sets out key definitions of variables that will be used later in the scripts.
# Initialize definitions ...
definitions <- list()
# Start benchmarking
definitions$run_start <- Sys.time()
# Specify survey wave and country subset
definitions$us_wave <- c("k") # "k" - US wave, do not change for now
definitions$geo_subset <- c("E") # "W" - for Wales (or "E" for England)
# might take a bit longer for England or England and Wales combined
# Configure visuals
definitions$fig_width <- 25 # = width of output figures in cm
definitions$fig_height <- 15 # = height of output figures in cm
definitions$fig_unit <- "cm" # = height of output figures in cm
definitions$map_cols <- c("seagreen","lightgrey","salmon")
This is mainly to state how efficiently and quickly the code ran.
definitions$run_end <- Sys.time() # end the benchmarking process
definitions$run_end - definitions$run_start # time it took to run
## Time difference of 0.02154398 secs
This demonstrates how to quickly load the Synthetic Population, Understanding Society (US) survey, and geo lookup.
# load sp #
sp_8cons <- data.table::fread("RData/sp_ind_wavek_census2011_est2020_8cons.csv")
# explore
str(sp_8cons) # general structure
## Classes 'data.table' and 'data.frame': 52853971 obs. of 2 variables:
## $ ZoneID: chr "E01000001" "E01000001" "E01000001" "E01000001" ...
## $ pidp : num 5.44e+08 9.57e+08 1.57e+09 4.77e+08 5.45e+08 ...
## - attr(*, ".internal.selfref")=<externalptr>
sp_8cons[1:5,] # first 5 rows of all columns
# subset for geography of interest defined previously -> definitions$geo_subset
sp_8cons[, ctr_code := substr(ZoneID, 1, 1)] # first letter of Zone ID
sp_8cons <- sp_8cons[ctr_code %in% definitions$geo_subset, ]
us_ind <- data.table::fread("RData/k_indresp.tab")
dim(us_ind) # general structure
## [1] 32008 3258
us_ind[1:5,1:5] # first 5 rows of the first 5 variables
data.table::setnames(us_ind, old = names(us_ind),
new = gsub(paste0("^",definitions$us_wave,"_"),"",names(us_ind)))
us_ind[1:5,1:5] # first 5 rows of the first 5 variables
us_ind_keep_var <- c("pidp", "hidp", # id for individuals
"sex", "age_dv", # age and sex
"sf12mcs_dv", # mental health indicator SF-12
"employ") # employment
# apply selection
us_ind <- us_ind[, ..us_ind_keep_var]
sp_8cons <- merge(sp_8cons, us_ind, by = "pidp", all.x = TRUE)
data.table::setkey(sp_8cons, ZoneID)
sp_8cons[1:5,] # check result first 10 rows of all columns
us_hh <- data.table::fread("RData/k_hhresp.tab")
dim(us_hh) # general structure
## [1] 18139 319
us_hh[1:5,1:5] # first 5 rows of the first 5 variables
# rename: let's get rid of start with "k_" #
data.table::setnames(us_hh, old = names(us_hh),
new = gsub(paste0("^",definitions$us_wave,"_"),"",names(us_hh)))
us_hh[1:5,1:5] # first 5 rows of the first 5 variables
# us_hh_keep_var <- c("hidp", # id for hh to link with individuals
# "xphsdct") # financial hardship: problems paying council tax
us_hh_keep_var <- c("hidp", # id for hh to link with individuals
"fihhmngrs_dv",
"xphsdct") # gross household income: month before interview
This is where you can also adapt the script for your own interests/purposes.
You can select variables of interest from the Understanding Society survey.
us_hh <- us_hh[, ..us_hh_keep_var]
# merge synthetic population with US data for households
sp_8cons <- merge(sp_8cons, us_hh, by = "hidp", all.x = TRUE)
# almost there!
sp_8cons
A geo-lookup is a file which describes the hierarchy of geographies.
e.g., LSOA’s/DZ –> MSOA/IZ –> LA –> GOR –> Countries –> UK
A source of these files is can be found here.
lookup_oa <- data.table::fread("RData/OA_to_Local_Authority_District_May_2021.csv")
str(lookup_oa)
## Classes 'data.table' and 'data.frame': 2661131 obs. of 14 variables:
## $ pcd7 : chr "AB1 0AA" "AB1 0AB" "AB1 0AD" "AB1 0AE" ...
## $ pcd8 : chr "AB1 0AA" "AB1 0AB" "AB1 0AD" "AB1 0AE" ...
## $ pcds : chr "AB1 0AA" "AB1 0AB" "AB1 0AD" "AB1 0AE" ...
## $ dointr : int 198001 198001 198001 199402 199012 199012 198001 198001 198001 198001 ...
## $ doterm : int 199606 199606 199606 199606 199207 199207 199606 199606 199606 199606 ...
## $ usertype: int 0 0 0 0 1 1 0 0 1 0 ...
## $ oa11cd : chr "S00090303" "S00090303" "S00090399" "S00091322" ...
## $ lsoa11cd: chr "S01006514" "S01006514" "S01006514" "S01006853" ...
## $ msoa11cd: chr "S02001237" "S02001237" "S02001237" "S02001296" ...
## $ ladcd : chr "S12000033" "S12000033" "S12000033" "S12000034" ...
## $ lsoa11nm: chr "Cults, Bieldside and Milltimber West - 02" "Cults, Bieldside and Milltimber West - 02" "Cults, Bieldside and Milltimber West - 02" "Dunecht, Durris and Drumoak - 01" ...
## $ msoa11nm: chr "Cults, Bieldside and Milltimber Wes" "Cults, Bieldside and Milltimber Wes" "Cults, Bieldside and Milltimber Wes" "Dunecht, Durris and Drumoak" ...
## $ ladnm : chr "Aberdeen City" "Aberdeen City" "Aberdeen City" "Aberdeenshire" ...
## $ ladnmw : chr "" "" "" "" ...
## - attr(*, ".internal.selfref")=<externalptr>
lookup_oa <- unique(lookup_oa[, .(lsoa11cd, msoa11cd, ladcd, ladnm)])
str(lookup_oa)
## Classes 'data.table' and 'data.frame': 42632 obs. of 4 variables:
## $ lsoa11cd: chr "S01006514" "S01006853" "S01006511" "S01006506" ...
## $ msoa11cd: chr "S02001237" "S02001296" "S02001236" "S02001236" ...
## $ ladcd : chr "S12000033" "S12000034" "S12000033" "S12000033" ...
## $ ladnm : chr "Aberdeen City" "Aberdeenshire" "Aberdeen City" "Aberdeen City" ...
## - attr(*, ".internal.selfref")=<externalptr>
sp_8cons <- merge(sp_8cons, lookup_oa, by.x = "ZoneID", by.y = "lsoa11cd",
all.x = TRUE)
sp_8cons[1:5,]
data.table::setcolorder(sp_8cons, neworder = c("ZoneID", "msoa11cd", "ladcd",
"ladnm", "ctr_code", "pidp", "hidp"))
sp_8cons[1:5,]
str(sp_8cons)
## Classes 'data.table' and 'data.frame': 45697898 obs. of 13 variables:
## $ ZoneID : chr "E01000001" "E01000001" "E01000001" "E01000001" ...
## $ msoa11cd : chr "E02000001" "E02000001" "E02000001" "E02000001" ...
## $ ladcd : chr "E09000001" "E09000001" "E09000001" "E09000001" ...
## $ ladnm : chr "City of London" "City of London" "City of London" "City of London" ...
## $ ctr_code : chr "E" "E" "E" "E" ...
## $ pidp : int 68029931 68538656 68155063 68197887 68202647 68231211 69013378 68278127 68336615 68361771 ...
## $ hidp : int 68102020 68455620 68537220 68741220 68768420 68904420 68938420 69020020 69217220 69353220 ...
## $ sex : int 1 1 1 2 2 1 1 2 1 2 ...
## $ age_dv : int 50 32 26 58 44 52 70 72 31 43 ...
## $ sf12mcs_dv : num 54.2 52.1 54.1 57.3 40.8 ...
## $ employ : int 1 1 1 1 1 1 2 2 1 1 ...
## $ fihhmngrs_dv: num 4352 4084 6896 1450 9204 ...
## $ xphsdct : int 2 2 2 2 2 1 2 2 2 2 ...
## - attr(*, ".internal.selfref")=<externalptr>
## - attr(*, "sorted")= chr "ZoneID"
This example will allow the investigation of absolute numbers and prevalence of financial hardship across different types of areas.
For example, how many individuals (or % of individuals) experience financial problems within a certain area?
Raw counts of individuals at LSOA level (simplest approach).
Prevalence of individuals at Local Authority level (more complicated).
“Problems paying council tax” will be the variable used to represent financial hardship.
# basic idea: an aggregation of a subset
sp_8cons[xphsdct == 1, ]
# ... an aggregation of a subset returning the number of observations
sp_8cons[xphsdct == 1, .(N_xphsdct = .N)]
# ... an aggregation of a subset returning the number of observations for DZs
sp_8cons[xphsdct == 1, .(N_xphsdct = .N), by = c("ZoneID") ]
# ... assigned to a new object
lsoa_hardship <- sp_8cons[xphsdct == 1, .(N_xphsdct = .N), by = c("ZoneID") ]
lsoa_hardship
# create and display plot
lsoa_hardship_plot <- ggplot2::ggplot(lsoa_hardship, aes(x = N_xphsdct)) +
geom_bar(width = 25)
lsoa_hardship_plot
## Warning: `position_stack()` requires non-overlapping x intervals.
Will use the “fihhmngrs_dv” continuous variable
sp_8cons[fihhmngrs_dv > 0, ]
# Aggregation of a subset returning the number of observations based on 'fihhmngrs_dv'
sp_8cons[fihhmngrs_dv > 0, .(N_fihhmngrs_dv = .N)]
# Aggregation of a subset returning the average value of 'fihhmngrs_dv' for each 'ZoneID'
avg_fihhmngrs <- sp_8cons[fihhmngrs_dv > 0, .(avg_fihhmngrs = mean(fihhmngrs_dv, na.rm = TRUE)), by = ZoneID]
avg_fihhmngrs
# Assign the result to a new object
lsoa_hardship <- sp_8cons[fihhmngrs_dv > 0, .(N_fihhmngrs_dv = .N), by = c("ZoneID")]
lsoa_hardship # Check the result
This variable indicates whether an individual is in paid employment (==1) or not (==2).
# basic idea: an aggregation of a subset
sp_8cons[employ == 1, ]
# ... an aggregation of a subset returning the number of observations
sp_8cons[employ == 1, .(employ = .N)]
# ... an aggregation of a subset returning the number of observations for DZs
sp_8cons[employ == 1, .(employ = .N), by = c("ZoneID") ]
# ... assigned to a new object
lsoa_hardship <- sp_8cons[employ == 1, .(employ = .N), by = c("ZoneID") ]
lsoa_hardship # looking good
# Create and display a plot
lsoa_hardship_plot <- ggplot2::ggplot(lsoa_hardship, aes(x = employ)) +
geom_bar(width = 25)
lsoa_hardship_plot
## Warning: `position_stack()` requires non-overlapping x intervals.